Open Source in Environmental Sustainability#

from IPython.display import display, HTML

import dateparser
import datetime
import handcalcs.render
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
import pycountry
from pycountry_convert import country_alpha2_to_continent_code, country_alpha3_to_country_alpha2
# Clean up the dataset
def name_to_iso3(x):
    """Resolve a free-text country name to its ISO 3166-1 alpha-3 code.

    The lookup is a best-effort fuzzy search through ``pycountry``; the first
    search result is used. The abbreviation "UK" is expanded to
    "United Kingdom" before searching.

    Arguments:
        x - a string with a country name

    Outputs:
        A string with the ISO 3166-1 alpha-3 code of the first match, or an
        empty string when x is not a string or the search fails

    """

    # Non-string values (e.g. NaN for a missing location) can never match.
    if not isinstance(x, str):
        return ""

    if x == "UK":
        x = "United Kingdom"
    try:
        return pycountry.countries.search_fuzzy(x)[0].alpha_3
    except Exception:
        # Best effort: any failed lookup maps to "". Unlike a bare ``except``
        # this does not swallow KeyboardInterrupt or SystemExit.
        return ""

def alpha3_to_alpha2(x):
    """Convert an ISO 3166-1 alpha-3 country code to ISO 3166-1 alpha-2.

    Arguments:
        x - a string with a country code following the ISO 3166-1 alpha-3 standard

    Outputs:
        A string with the matching ISO 3166-1 alpha-2 country code, or an
        empty string when x is empty, not a string, or cannot be converted

    """

    # An empty or non-string code can never be converted; skip the lookup.
    if not isinstance(x, str) or not x:
        return ""

    try:
        return country_alpha3_to_country_alpha2(x)
    except Exception:
        # Best effort: unknown codes map to "". Unlike a bare ``except`` this
        # does not swallow KeyboardInterrupt or SystemExit.
        return ""


def alpha2_to_continent(x):
    """Convert an ISO 3166-1 alpha-2 country code to a continent code.

    Arguments:
        x - a string with a country code following the ISO 3166-1 alpha-2 standard

    Outputs:
        A string with the continent code returned by
        country_alpha2_to_continent_code, or an empty string when x is empty,
        not a string, or cannot be converted

    """

    # An empty or non-string code can never be converted; skip the lookup.
    if not isinstance(x, str) or not x:
        return ""

    try:
        return country_alpha2_to_continent_code(x)
    except Exception:
        # Best effort: unknown codes map to "". Unlike a bare ``except`` this
        # does not swallow KeyboardInterrupt or SystemExit.
        return ""


def upper_string(lower_string):
    """Return the given string in title case
    Arguments:
        lower_string - the string to convert
    Outputs:
        A new string in which every word starts with an upper-case letter
        followed by lower-case letters

    """

    titled = lower_string.title()
    return titled

def calc_age(start_date):
    """Return the time elapsed since start_date, expressed in years
    Arguments:
        start_date - a date string understood by dateparser
    Outputs:
        A float: the whole days between start_date and now, divided by 365

    """
    now = datetime.datetime.now()
    # The date string is parsed with the TIMEZONE setting fixed to CEST.
    started = dateparser.parse(start_date, settings={"TIMEZONE": "CEST"})
    elapsed = now - started
    return elapsed.days / 365

def count_strings(comma_seperated_string):
    """Count number of delimiters (commas) in a string
    Arguments:
        comma_seperated_string - a string containing commas; any other type
            (e.g. NaN for a missing value) is accepted and counts as zero
    Outputs:
        A number (int) of commas found in comma_seperated_string. Note that
        this is the number of delimiters, i.e. one less than the number of
        items in a non-empty comma-separated list.

    """

    # isinstance() also accepts str subclasses, unlike comparing type() to str.
    if isinstance(comma_seperated_string, str):
        return comma_seperated_string.count(",")
    return 0

Set Default Plotting Options#

# default plotting options
# Palette https://coolors.co/palette/0e7c7b-17bebb-ffc857-e9724c-c5283d
# Default figure height. This must be a plain number: the previous trailing
# comma made it the one-element tuple (800,).
height = 800  # Added parameter
# Shared colours: a reversed sequential scale for continuous values, one
# accent colour for single-series charts and a five-colour palette for
# categorical charts.
color_continuous_scale = px.colors.sequential.Aggrnyl[::-1]
marker_color = "#0E7C7B"
color_discrete_sequence = ["#0E7C7B", "#17BEBB", "#FFC857", "#E9724C", "#C5283D"]

# Register the project theme as a named plotly template
pio.templates["OpenSustain"] = go.layout.Template(
    layout={
        "margin": {"l": 10, "r": 0, "t": 30, "b": 0},
        "font": {
            "family": "Open Sans",
            "color": "#040404",
            "size": 15,
        },
        "title_font_family": "Google Font",
        "title_font_color": "#040404",
    },
)

# Layer the project theme on top of plotly's default template
pio.templates.default = "plotly+OpenSustain"
# Load the project dataset and rename two columns in a single pass:
# "rubric" becomes "topic" and "topics" becomes "labels".
df_raw = pd.read_csv("./csv/projects.csv")
df_raw.rename(columns={"rubric": "topic", "topics": "labels"}, inplace=True)

Calculate Age in Years#

# Express the project age in years (days / 365), which reads better on plots
df_raw["project_age_in_years"] = df_raw["project_age_in_days"] / 365
# Upper age bound (in years) used by the age scatter plots further down
max_age_in_years = 8.0

Basic Statistics#

First, let us get a rough overview of the project dataset.

# Overview table with key figures of the full (unfiltered) dataset.
# The value counts are computed once and read with .get(..., 0) so that a
# category missing from the data shows up as 0 instead of raising a KeyError.
platform_counts = df_raw["platform"].value_counts()
activity_counts = df_raw["project_active"].value_counts()


def _percent_true(column):
    """Return the share of True values in df_raw[column] in percent.

    The share is taken from value_counts(normalize=True) and rounded to two
    decimals; a column without any True value yields 0.
    """
    shares = df_raw[column].value_counts(normalize=True)
    return round(shares.get(True, 0) * 100, 2)


fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["Dimension", "Value"],
                line_color='#000000',
                fill_color='#ffffff',
                font_size=18,
            ),
            cells=dict(
                fill_color='#ffffff',
                line_color='#ffffff',
                font_size=16,
                height=30,
                values=[
                    # First column: row labels
                    [
                        "Total number of projects",
                        "Github projects",
                        "Gitlab projects",
                        "Other platforms",
                        "Number of projects in personal namespace",
                        "Total stars of all projects",
                        "Total contributers of all projects",
                        "Active GitHub projects",
                        "Inactive GitHub projects",
                        "Projects with contribution guide in %",
                        "Projects with code of conduct in %",
                        "Projects accepting donations in %",
                        "Median number of commits",
                        "Median stargazers",
                        "Median stars last year",
                        "Median Development Distribution Score",
                        "Median number of contributors",
                        "Median closed issues last year",
                        "Median commits last year",
                        "Median age in years",
                    ],
                    # Second column: values, in the same order as the labels
                    [
                        df_raw["project_name"].count(),
                        platform_counts.get("github", 0),
                        platform_counts.get("gitlab", 0),
                        platform_counts.get("custom", 0),
                        # Projects without an organization entry
                        df_raw["project_name"].count() - df_raw["organization"].count(),
                        df_raw["stargazers_count"].sum(),
                        df_raw["contributors"].sum(),
                        activity_counts.get(True, 0),
                        activity_counts.get(False, 0),
                        _percent_true("contribution_guide"),
                        _percent_true("code_of_conduct"),
                        _percent_true("accepts_donations"),
                        df_raw["total_number_of_commits"].median(),
                        df_raw["stargazers_count"].median(),
                        df_raw["stars_last_year"].median(),
                        round(df_raw["development_distribution_score"].median(), 4),
                        df_raw["contributors"].median(),
                        df_raw["issues_closed_last_year"].median(),
                        df_raw["total_commits_last_year"].median(),
                        round(df_raw["project_age_in_years"].median(), 2),
                    ],
                ],
            ),
        )
    ]
)

fig.update_layout(
    height=1000,
    width=700,
)
fig.show()

Development Distribution Score#

The Development Distribution Score (DDS) measures how development is distributed among a project's contributors by relating the contributor with the most commits to all other contributors. Distributing knowledge, work, and governance within a project ensures sustainability: when people leave an open source project or no longer find time for it, others can continue the work and step into leading positions.

The DDS is calculated in the preprocessing script and is similar to the bus factor. It is based only on quantitative values derived from git statistics.

Filter Data#

# Build the analysis subset: one boolean mask per criterion, applied together.
df_active = df_raw.copy()
# Keep only active projects (element-wise comparison, hence "== True")
is_active = df_active["project_active"] == True
# Curated lists are not classical open source projects and are excluded
is_not_curated_list = df_active["topic"] != "Curated Lists"
# Keep only projects hosted on the GitHub platform
is_on_github = df_active["platform"] == "github"
df_active = df_active[is_active & is_not_curated_list & is_on_github]

Score Projects#

def _pct_rank(column, **rank_options):
    """Return the percentile rank (0..1] of every value in df_active[column]."""
    return df_active[column].rank(pct=True, **rank_options)


# Activity score: sum of four percentile ranks.
# NOTE(review): a larger days_until_last_issue_closed value receives a higher
# rank here and therefore raises the activity score - confirm this is intended.
# Projects without a release date get the lowest rank (na_option="top").
df_active["activity"] = (
    _pct_rank("total_commits_last_year")
    + _pct_rank("issues_closed_last_year")
    + _pct_rank("days_until_last_issue_closed")
    + _pct_rank("last_released_date", na_option="top")
)

# Community score: sum of three percentile ranks
df_active["community"] = (
    _pct_rank("contributors")
    + _pct_rank("development_distribution_score")
    + _pct_rank("reviews_per_pr")
)

# Size score: sum of four percentile ranks
df_active["size"] = (
    _pct_rank("total_number_of_commits")
    + _pct_rank("contributors")
    + _pct_rank("closed_issues")
    + _pct_rank("closed_pullrequests")
)

# Each score is scaled by its maximum; the total is the mean of the three
activity_share = df_active["activity"] / df_active["activity"].max()
community_share = df_active["community"] / df_active["community"].max()
size_share = df_active["size"] / df_active["size"].max()
df_active["total_score"] = (activity_share + community_share + size_share) / 3

# Persist the scored dataset so that later cells can reload it
df_active_path = "./csv/project_analysis.csv"
df_active.to_csv(df_active_path)
%%render
## The calculation in this cell shall give the reader an understanding of how the DDS is calculated.
## Values calculated here are not used in any other cell.
n_MaxCommitsSingleContributor = 90
n_total_commits = 100


DDS = 1 - n_MaxCommitsSingleContributor / n_total_commits
\[\begin{split} \begin{aligned} & \textrm{ The calcluation within this cell shall reader give an understanding on how the DDS is been calculated. }\\[10pt] & \textrm{ Values calculated here are not used in any other cell.}\\[10pt] n_{MaxCommitsSingleContributor} &= 90 \; \\[10pt] n_{total_{commits}} &= 100 \; \\[10pt] \mathrm{DDS} &= 1 - \frac{ n_{MaxCommitsSingleContributor} }{ n_{total_{commits}} } = 1 - \frac{ 90 }{ 100 } &= 0.100 \end{aligned} \end{split}\]
### KK: this is where a clear object naming convention + comments would really help: is syntax df[df_raw[..]] appropriate here?
### KK: it might be helpful to plot boxplots for the below scores per category to better show their distribution, including median

# Project groups whose median DDS is compared in the table below
# Active projects without an organization entry (personal namespace)
df_personal_projects = df_active[df_active["organization"].isna()]
# Active projects that belong to an organization
df_organization_projects = df_active[df_active["organization"].notna()]
# Inactive projects, taken from the unfiltered dataset
df_inactive = df_raw[(df_raw["project_active"] == False)]
# Active projects with more than 100 stars
df_top_stargazers = df_active[(df_active["stargazers_count"] > 100)]

fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["Group", "Median DDS"],
                line_color='#000000',
                fill_color='#ffffff',
                font_size=18,
            ),
            cells=dict(
                line_color='#ffffff',
                fill_color='#ffffff',
                font_size=16,
                height=30,
                values=[
                    [
                        "All projects",
                        "Active projects in personal namespace",
                        "Active organization projects",
                        "Active projects",
                        "Inactive projects",
                        # Label now matches the "> 100" filter of
                        # df_top_stargazers (it previously claimed 50 stars).
                        "Active projects with more than 100 Stars",
                        "Projects with most contributors",
                    ],
                    [
                        round(df_raw["development_distribution_score"].median(), 3),
                        round(df_personal_projects["development_distribution_score"].median(), 3),
                        round(df_organization_projects["development_distribution_score"].median(), 3),
                        round(df_active["development_distribution_score"].median(), 3),
                        round(df_inactive["development_distribution_score"].median(), 3),
                        round(df_top_stargazers["development_distribution_score"].median(), 3),
                        # The 50 active projects with the most contributors
                        round(df_active.nlargest(50, "contributors")["development_distribution_score"].median(), 3),
                    ],
                ],
            ),
        )
    ]
)

fig.update_layout(
    width=700,
)

fig.show()
# Display the row at integer position 300 of df_active, including the score columns
df_active.iloc[300]
project_name                                                                   EVCC
oneliner                          An extensible EV Charge Controller with PV int...
git_namespace                                                                 andig
git_url                                         https://github.com/evcc-io/evcc.git
platform                                                                     github
labels                            mqtt,golang,pv,wallbox,emobility,charger,wallb...
topic                                                   Mobility and Transportation
last_commit_date                                               2022/09/04, 20:03:13
stargazers_count                                                              759.0
number_of_dependents                                                           23.0
stars_last_year                                                               492.0
project_active                                                                 True
dominating_language                                                              Go
organization                                                                    NaN
organization_user_name                                                      evcc-io
languages                         Go,Vue,JavaScript,Smarty,CSS,Shell,Makefile,Do...
homepage                                                            https://evcc.io
refs                                                                            NaN
project_created                                                2019/12/06, 16:27:04
project_age_in_days                                                          1003.0
license                                                                         MIT
total_commits_last_year                                                      1096.0
total_number_of_commits                                                      2190.0
last_issue_closed                                              2022/09/04, 21:01:01
open_issues                                                                    39.0
closed_pullrequests                                                          1394.0
closed_issues                                                                2266.0
issues_closed_last_year                                                      1257.0
days_until_last_issue_closed                                                    0.0
open_pullrequests                                                              23.0
reviews_per_pr                                                                  1.1
development_distribution_score                                             0.222484
last_released_date                                             2022/08/13, 11:31:37
last_release_tag_name                                                         0.100
good_first_issue                                                                0.0
contributors                                                                   57.0
accepts_donations                                                              True
donation_platforms                github,patreon,open_collective,ko_fi,tidelift,...
code_of_conduct                                                               False
contribution_guide                                                            False
dependents_repos                  JanDragon/evcc,opensprinklershop/evcc,matspi/e...
organization_name                                                               NaN
organization_github_url                                  https://github.com/evcc-io
organization_website                                                https://evcc.io
organization_location                                                       Germany
organization_country                                                            NaN
organization_form                                                               NaN
organization_avatar               https://avatars.githubusercontent.com/u/813835...
organization_public_repos                                                       NaN
organization_created                                                            NaN
organization_last_update                                                        NaN
project_age_in_years                                                       2.747945
activity                                                                   2.871102
community                                                                  2.193867
size                                                                       3.682952
total_score                                                                0.856184
Name: 406, dtype: object

Process Active GitHub Projects#

# Reload the scored dataset from the CSV file at df_active_path.
# NOTE(review): the file was written by to_csv() with its index, so the old
# index presumably comes back as an extra unnamed column - confirm if relevant.
df_active = pd.read_csv(df_active_path)

Start Plotting#

# Count the projects per license.
# The counts column is named explicitly: pandas >= 2.0 calls the result of
# value_counts() "count" instead of reusing the source column name, which
# would break the values="license" lookup below.
license_his = (
    df_active["license"]
    .value_counts()
    .rename("license")
    .to_frame()
    .rename_axis("license_names")
    .reset_index()
)
fig = px.pie(
    license_his,
    values="license",
    names="license_names",
    color_discrete_sequence=color_discrete_sequence,
    hole=0.2,
)

fig.update_layout(title="Distribution of Licenses", showlegend=False, font_size=16)
fig.update_traces(textposition='inside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=1)))
fig.show()
# alternative to the pie chart in cell 23
# main point: ~80% of all open source licences fall under 5 types
main_license_types = ['BSD-3-Clause', 'MIT', 'GPL-3.0', 'CUSTOM', 'Apache-2.0']
alt_df_active = df_active.copy()

# Pool every license outside the main types into a single "Other" bucket
alt_df_active['pooled_license'] = np.where(
    alt_df_active['license'].isin(main_license_types), alt_df_active['license'], 'Other')

# The counts column is named explicitly: pandas >= 2.0 calls the result of
# value_counts() "count" instead of reusing the source column name, which
# would break the values="pooled_license" lookup below.
alt_license_his = (
    alt_df_active["pooled_license"]
    .value_counts()
    .rename("pooled_license")
    .to_frame()
    .rename_axis("license_names")
    .reset_index()
)
alt_fig = px.pie(
    alt_license_his,
    values="pooled_license",
    names="license_names",
    color_discrete_sequence=color_discrete_sequence,
    hole=0.2,
)

alt_fig.update_layout(title="Distribution of Licenses", showlegend=False, font_size=16)
alt_fig.update_traces(textposition='inside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=1)))
alt_fig.show()
# Histogram of the project age (in years), 50 bins
fig = px.histogram(
    df_active,
    x="project_age_in_years",
    nbins=50,
    title="Distribution of Project Age in Years",
).update_layout(
    yaxis_title=None,
    xaxis_title="Project Age",
).update_traces(marker_color=marker_color)
fig.show()
# Histogram of the total number of commits per project, 50 bins
fig = px.histogram(
    df_active,
    x="total_number_of_commits",
    nbins=50,
    title="Distribution of Total Commits",
).update_layout(
    yaxis_title="Projects",
    xaxis_title="Project Total Commits",
).update_traces(marker_color=marker_color)
fig.show()
## KK: Same as above: what's the message? How else can we group the data to get it across?

# Pie chart of the summed contributor counts per topic
contributors_per_topic = df_active.groupby('topic')['contributors'].sum().reset_index()
fig = px.pie(
    contributors_per_topic,
    values="contributors",
    names="topic",
    color_discrete_sequence=color_discrete_sequence,
    hole=0.2,
)

fig.update_layout(title="Contributors within Topics", height=1200, showlegend=False)
fig.update_traces(
    textposition='outside',
    textinfo='value+label',
    marker={"line": {"color": '#000000', "width": 2}},
)
fig.show()
# Number of projects per topic (not used by the chart in this cell)
topic_his = (
    df_active["topic"]
    .value_counts()
    .to_frame()
    .rename_axis("topic_names")
    .reset_index()
)

# Horizontal bar chart: summed contributors per topic, largest sum first
contributors_by_topic = (
    df_active.groupby('topic')['contributors']
    .sum()
    .reset_index()
    .sort_values('contributors', ascending=False)
)
fig = px.bar(
    contributors_by_topic,
    x="contributors",
    y="topic",
    orientation="h",
).update_layout(
    height=1000,  # Added parameter
    yaxis_title=None,
    xaxis_title="Contributors",
    title="Contributors within Topics",
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
).update_traces(marker_color=marker_color)
fig.update(layout_showlegend=False)
# Number of projects per topic.
# The counts column is named explicitly: pandas >= 2.0 calls the result of
# value_counts() "count" instead of reusing the source column name, which
# would break the x="topic" lookup below.
topic_his = (
    df_active["topic"]
    .value_counts()
    .rename("topic")
    .to_frame()
    .rename_axis("topic_names")
    .reset_index()
)

# Horizontal bar chart: one bar per topic, bar length = number of projects
fig = px.bar(
    topic_his,
    x="topic",
    y="topic_names",
    orientation="h",
)

fig.update_layout(
    height=1000,  # Added parameter
    yaxis_title=None,
    xaxis_title="Projects",
    title="Projects within Topics",
    coloraxis_colorbar=dict(
        title="DDS",
    ),
    hoverlabel=dict(
        bgcolor="white"
    )
)
fig.update_traces(marker_color=marker_color)
fig.update(layout_showlegend=False)
# Number of projects per dominating programming language.
# The counts column is named explicitly: pandas >= 2.0 calls the result of
# value_counts() "count" instead of reusing the source column name, which
# would break the "dominating_language" lookups below.
license_dominating_language = (
    df_active["dominating_language"]
    .value_counts()
    .rename("dominating_language")
    .to_frame()
    .rename_axis("dominating_language_names")
    .reset_index()
)
license_dominating_language
# Only languages that dominate more than four projects are shown in the pie
license_dominating_language = license_dominating_language[(license_dominating_language["dominating_language"] > 4)]
fig = px.pie(
    license_dominating_language,
    values="dominating_language",
    names="dominating_language_names",
    color_discrete_sequence=color_discrete_sequence,
    hole=0.2,
)

fig.update_layout(title="Distribution of Programming Languages", showlegend=True, font_size=16, height=800)
fig.update_traces(textposition='inside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=1)))
fig.show()
# KK: I think the question that we should be asking: are there similar patterns followed by most topics? If so, what are they? If not, what are the fields that stand out and what is the difference?

# df_sorted = df.groupby(['topic'], as_index=False)['dominating_language'].agg('sum')
# Number of projects per (topic, dominating language) pair.
# The counts are named "counts" directly on the Series: the old
# rename(columns={0: "counts"}) only works on pandas < 2.0, where the unnamed
# counts column is labelled 0 (pandas >= 2.0 labels it "count").
df_language_distribution = (
    df_active.value_counts(["topic", "dominating_language"])
    .rename("counts")
    .reset_index()
)

# Bubble chart: marker size shows the number of projects per pair
fig = px.scatter(
    df_language_distribution, x="dominating_language", y="topic", size="counts",
)


fig.update_layout(
    height=1000,  # Added parameter
    width=1200,
    xaxis_title="Dominating Language",
    yaxis_title=None,
)
fig.update_traces(marker_color=marker_color)


fig.show()
# KK: I think the question that we should be asking: are there similar patterns followed by most topics? If so, what are they? If not, what are the fields that stand out and what is the difference?

# df_sorted = df.groupby(['topic'], as_index=False)['dominating_language'].agg('sum')
# Number of projects per (topic, license) pair.
# The counts are named "counts" directly on the Series: the old
# rename(columns={0: "counts"}) only works on pandas < 2.0, where the unnamed
# counts column is labelled 0 (pandas >= 2.0 labels it "count").
df_license_distribution = (
    df_active.value_counts(["topic", "license"])
    .rename("counts")
    .reset_index()
)

# Bubble chart: marker size shows the number of projects per pair
fig = px.scatter(df_license_distribution, x="license", y="topic", size="counts")


fig.update_layout(
    height=1000,  # Added parameter
    xaxis_title="License",
    yaxis_title=None,
    title="License Distribution over Topic",
    autosize=True,
)
fig.update_traces(marker_color=marker_color)


fig.show()
# Histogram of the number of contributors per project, 100 bins
fig = px.histogram(
    df_active,
    x="contributors",
    nbins=100,
    title=" Contributors",
).update_layout(
    yaxis_title="Projects",
    xaxis_title="Contributors",
).update_traces(marker_color=marker_color)
fig.show()
# Number of listed projects per git namespace, most frequent first.
# The Series is named "counts" before it becomes a column: the old
# rename(columns={"git_namespace": "counts"}) only works on pandas < 2.0
# (pandas >= 2.0 labels the value_counts() result "count").
most_listed_projects = (
    df_active["git_namespace"]
    .value_counts(ascending=False)
    .rename("counts")
    .rename_axis("Namespace")
    .reset_index()
)
fig = go.Figure(data=[go.Table(
    header=dict(values=list(most_listed_projects.columns), line_color='#000000', fill_color='#ffffff', font_size=18),
    cells=dict(
        line_color='#ffffff',
        fill_color='#ffffff',
        font_size=16,
        height=30,
        values=[most_listed_projects.Namespace, most_listed_projects.counts],
    ),
)])

fig.update_layout(
    autosize=False,
)

fig.show()
# The 40 oldest projects, bars coloured by their DDS
oldest_projects = df_active.nlargest(40, "project_age_in_years")

fig = px.bar(
    oldest_projects,
    x=oldest_projects["project_age_in_years"],
    y=oldest_projects["project_name"],
    orientation="h",
    range_x=(9.6, 14),
    hover_name=oldest_projects["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color=oldest_projects["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
).update_layout(
    height=1000,  # Added parameter
    yaxis_title=None,
    xaxis_title="Project Age in Years",
    title="The oldest Projects still active",
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
)

fig.update(layout_showlegend=False)
# The 40 projects with the most contributors, bars coloured by their DDS
contributors = df_active.nlargest(40, "contributors")

fig = px.bar(
    contributors,
    x=contributors["contributors"],
    y=contributors["project_name"],
    orientation="h",
    title="Projects with most contributors",
    hover_name=contributors["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color=contributors["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
).update_layout(
    height=1200,  # Added parameter
    xaxis_title="Contributors",
    yaxis_title=None,
    # This title replaces the one passed to px.bar above
    title="Projects with the most contributors",
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
)

fig.update(layout_showlegend=False)
# The 40 projects with the most stars, bars coloured by their DDS
top_stargazers = df_active.nlargest(40, "stargazers_count")

fig = px.bar(
    top_stargazers,
    x=top_stargazers["stargazers_count"],
    y=top_stargazers["project_name"],
    orientation="h",
    hover_name=top_stargazers["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color=top_stargazers["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
).update_layout(
    height=1000,  # Added parameter
    xaxis_title="Stars",
    yaxis_title=None,
    title="Projects with the most Stars",
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
)

fig.update(layout_showlegend=False)
# Star growth = stars gained last year relative to the current star count,
# computed only for projects with more than 100 stars
has_over_100_stars = df_active["stargazers_count"] > 100
df_top_100_stargazers = df_active[has_over_100_stars].copy()
df_top_100_stargazers["star_growth"] = (
    df_top_100_stargazers["stars_last_year"] / df_top_100_stargazers["stargazers_count"]
)

# The 40 projects with the highest star growth, plotted in percent
df_top_40_star_growth = df_top_100_stargazers.nlargest(40, "star_growth")
fig = px.bar(
    df_top_40_star_growth,
    x=df_top_40_star_growth["star_growth"] * 100,
    y=df_top_40_star_growth["project_name"],
    orientation="h",
    hover_name=df_top_40_star_growth["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color=df_top_40_star_growth["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
)

fig.update_layout(
    height=1000,  # Added parameter
    xaxis_title="Star Growth last Year [%]",
    yaxis_title=None,
    title="Projects with the highest Star Growth",
    hoverlabel={"bgcolor": "white"},
    coloraxis_colorbar={"title": "DDS"},
)
# The 40 projects with the most commits in the last year, with the project
# "ElexonDataPortal" removed from the selection afterwards
df_top_40_growth = df_active.nlargest(40, "total_commits_last_year")
df_top_40_growth = df_top_40_growth[df_top_40_growth["project_name"] != "ElexonDataPortal"]
fig = px.bar(
    df_top_40_growth,
    x=df_top_40_growth["total_commits_last_year"],
    y=df_top_40_growth["project_name"],
    orientation="h",
    color=df_top_40_growth["development_distribution_score"],
    hover_name=df_top_40_growth["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color_continuous_scale=color_continuous_scale,
)

fig.update_layout(
    height=1000,  # Added parameter
    # The x axis shows the absolute number of commits in the last year, not a
    # percentage, so the former "[%]" growth label was wrong.
    xaxis_title="Commits last Year",
    yaxis_title=None,
    # NOTE(review): the title still says "Growth" although absolute counts are
    # plotted - consider renaming it as well.
    title="Projects with the highest Commit Growth",
    coloraxis_colorbar=dict(
        title="DDS",
    ),
    hoverlabel=dict(
        bgcolor="white"
    )
)
# The 40 projects with the highest total score, bars coloured by their DDS
df_total_score = df_active.nlargest(40, "total_score")

fig = px.bar(
    df_total_score,
    x=df_total_score["total_score"],
    y=df_total_score["project_name"],
    orientation="h",
    range_x=(0.85, 1),
    hover_name=df_total_score["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color=df_total_score["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
).update_layout(
    height=1000,  # Added parameter
    xaxis_title="Total Score",
    yaxis_title=None,
    title="Top Total Score",
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
)
fig.update(layout_showlegend=False)
# The 40 projects with the highest activity score, bars coloured by their DDS
df_activity_score = df_active.nlargest(40, "activity")

fig = px.bar(
    df_activity_score,
    x=df_activity_score["activity"],
    y=df_activity_score["project_name"],
    orientation="h",
    range_x=(2.9, 3.2),
    hover_name=df_activity_score["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color=df_activity_score["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
).update_layout(
    height=1000,  # Added parameter
    xaxis_title="Activity Score",
    yaxis_title=None,
    title="Projects with the highest Activity Score",
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
)

fig.update(layout_showlegend=False)
# The 40 projects with the highest size score, bars coloured by their DDS
df_size_score = df_active.nlargest(40, "size")

fig = px.bar(
    df_size_score,
    x=df_size_score["size"],
    y=df_size_score["project_name"],
    orientation="h",
    range_x=(3.75, 4),
    hover_name=df_size_score["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color=df_size_score["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
).update_layout(
    height=1000,  # Added parameter
    xaxis_title="Size Score",
    yaxis_title=None,
    title="Projects with the highest Size Score",
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
)

fig.update(layout_showlegend=False)
# KK: I'd suggest selecting a few most interesting examples conveying a message and put plots with raw data in the Appendix

# Scatter of projects younger than max_age_in_years: age against topic,
# marker size = size score, marker colour = total score
younger_projects = df_active.query("project_age_in_years<@max_age_in_years")
fig = px.scatter(
    younger_projects,
    x="project_age_in_years",
    y="topic",
    size="size",
    color="total_score",
    hover_name="git_url",
    hover_data=["oneliner", "topic", "git_namespace"],
    size_max=20,
).update_layout(
    coloraxis_colorbar={"title": "Total Score"},
    height=1000,  # Added parameter
    xaxis_title="Project Age in Years",
    yaxis_title=None,
    title="Total Score of Projects",
    hoverlabel={"bgcolor": "white"},
)


fig.show()
# KK: I'd suggest selecting a few most interesting examples conveying a message and put plots with raw data in the Appendix

# Scatter of organization projects younger than max_age_in_years: age against
# topic, marker size = size score, marker colour = DDS
younger_organization_projects = df_organization_projects.query(
    "project_age_in_years<@max_age_in_years"
)
fig = px.scatter(
    younger_organization_projects,
    x="project_age_in_years",
    y="topic",
    size="size",
    color="development_distribution_score",
    hover_name="git_url",
    hover_data=["oneliner", "topic", "git_namespace"],
    size_max=20,
).update_layout(
    coloraxis_colorbar={"title": "DDS"},
    yaxis_title=None,
    xaxis_title="Project Age in Years",
    height=1000,  # Added parameter
    title="Development Distribution Score",
    hoverlabel={"bgcolor": "white"},
)
fig.show()
# The 40 most-starred projects without an organization, labelled by their git
# namespace and coloured by their DDS
personal_stargazers = df_personal_projects.nlargest(40, "stargazers_count")

fig = px.bar(
    personal_stargazers,
    x=personal_stargazers["stargazers_count"],
    y=personal_stargazers["git_namespace"],
    orientation="h",
    hover_name=personal_stargazers["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color=personal_stargazers["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
).update_layout(
    height=1000,  # Added parameter
    yaxis_title=None,
    xaxis_title="Stars",
    title="Projects with most Stars in User Namespace",
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
)


fig.update(layout_showlegend=False)
# KK: can topics be grouped in fewer categories? can DDS be bucketed into categories, e.g. 0.3>=, 0.3<=&<=0.6, 0.6>=? Do we need to show all three variables, projects, DDS and dependents?

# NOTE(review): count_strings counts the commas in dependents_repos, which is
# one less than the number of listed repositories - confirm this is intended.
df_active["dependents_count"] = df_active["dependents_repos"].apply(count_strings)

# Top 50 projects by dependents count, without "Mission Support System"
most_dependent_projects = df_active.nlargest(50, "dependents_count")
is_not_mss = most_dependent_projects["project_name"] != "Mission Support System"
most_dependent_projects = most_dependent_projects[is_not_mss]
median_dds = most_dependent_projects["development_distribution_score"].median()
print("DDS of most used Python project:", round(median_dds, 3))


fig = px.bar(
    most_dependent_projects,
    x=most_dependent_projects["dependents_count"],
    y=most_dependent_projects["project_name"],
    orientation="h",
    hover_name=most_dependent_projects["git_url"],
    hover_data=["oneliner", "topic", "git_namespace"],
    color=most_dependent_projects["development_distribution_score"],
    color_continuous_scale=color_continuous_scale,
)

fig.update_layout(
    height=1000,  # Added parameter
    yaxis_title=None,
    xaxis_title="Dependents",
    title="Most used Python Projects vs. DDS",
    coloraxis_colorbar={"title": "DDS"},
    hoverlabel={"bgcolor": "white"},
)
DDS of most used Python project: 0.436

Process the organizations#

# Load the organizations and derive, from location_country, an ISO-3 code, the
# matching ISO-2 code and a continent code. Each helper returns "" when the
# lookup fails, so unresolved rows end up with an empty continent code.
df_organizations = pd.read_csv("./csv/github_organizations.csv")
df_organizations["ISO_3"] = df_organizations["location_country"].apply(name_to_iso3)
df_organizations["ISO_3_alpha2"] = df_organizations["ISO_3"].apply(alpha3_to_alpha2)
df_organizations["continent"] = df_organizations["ISO_3_alpha2"].apply(alpha2_to_continent)

# Organizations per continent, indexed by a readable continent name.
continent_his = (
    df_organizations["continent"]
    .value_counts()
    # Pin the counts column name: pandas >= 2.0 names the value_counts()
    # result "count", which would break values="continent" below.
    .rename("continent")
    .to_frame()
    .rename_axis("continent_name")
    # The empty code (unresolved country) is reported as "Global".
    .rename(
        index={
            "EU": "Europe",
            "NA": "North America",
            "": "Global",
            "OC": "Oceania",
            "AS": "Asia",
            "SA": "South America",
            "AF": "Africa",
        }
    )
)

print(continent_his)
fig = px.pie(continent_his.reset_index(), values="continent", names="continent_name", color_discrete_sequence=color_discrete_sequence, hole=0.2)

fig.update_layout(title="Distribution of Organizations between Continents", font_size=16, showlegend=False)
fig.update_traces(textposition='outside', textinfo='label+percent', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
                continent
continent_name           
Europe                203
North America         191
Global                176
Oceania                19
Asia                   12
South America           6
Africa                  4
# alternative to the plot in cell 52

# Variant of the continent pie chart in which every continent code other than
# EU, NA and the empty code is pooled into a single "Other" slice.
alt_df_organizations = df_organizations.copy()
vals_to_replace = {"EU": "Europe", "NA": "North America", "": "Global", "OC":"Other", "AS":"Other", "SA":"Other", "AF": "Other"}
# Series.map turns any code missing from vals_to_replace into NaN, which
# value_counts() then drops.
alt_df_organizations['continent'] = alt_df_organizations['continent'].map(vals_to_replace)

alt_continent_his = (
    alt_df_organizations["continent"]
    .value_counts()
    # Pin the counts column name: pandas >= 2.0 names the value_counts()
    # result "count", which would break values="continent" below.
    .rename("continent")
    .to_frame()
    .rename_axis("continent_name")
)


print(alt_continent_his)
alt_fig = px.pie(alt_continent_his.reset_index(), values="continent", names="continent_name", color_discrete_sequence=color_discrete_sequence, hole=0.2)

alt_fig.update_layout(title="Distribution of Organizations between Continents", font_size=16, showlegend=False)
alt_fig.update_traces(textposition='outside', textinfo='label+percent', marker=dict(line=dict(color='#000000', width=2)))
alt_fig.show()
                continent
continent_name           
Europe                203
North America         191
Global                176
Other                  41
## https://octoverse.github.com/
# Share of users per continent, in percent. This must be an ordered sequence:
# a set literal has no defined order, so the value/label pairing would depend
# on set iteration order.
# NOTE(review): the pairing below assumes index_labels is listed in ascending
# order of share (Oceania smallest, North America largest) -- confirm against
# the Octoverse report.
values = [1.7, 2.3, 5.9, 27.3, 31.2, 31.5]
index_labels = ['Oceania', 'Africa', 'South America', 'Europe', 'Asia', 'North America']
# reset_index() yields the columns "index" (continent name) and 0 (share),
# which are the names px.pie refers to below.
df_users_continent_cotoverse = pd.DataFrame(values, index=index_labels).reset_index()
# similar pooling to the one in cell 53 could be done here for Africa + Oceania

fig = px.pie(df_users_continent_cotoverse, values=0, names="index", color_discrete_sequence=color_discrete_sequence, hole=0.2)

fig.update_layout(title="Distribution of Users between Continents", font_size=16, showlegend=False)
fig.update_traces(textposition='outside', textinfo='label+percent', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
# Number of organizations per form_of_organization, as a two-column frame:
# "organization" (the form) and "form_of_organization" (the count).
organization_his = (
    df_organizations["form_of_organization"]
    .value_counts()
    # Pin the counts column name: pandas >= 2.0 names the value_counts()
    # result "count", which would break values="form_of_organization" below.
    .rename("form_of_organization")
    .rename_axis("organization")
    .reset_index()
)

# Title-case the labels for display (upper_string applies str.title()).
organization_his["organization"] = organization_his["organization"].apply(upper_string)
print(organization_his)
fig = px.pie(organization_his, values="form_of_organization", names="organization", color_discrete_sequence=color_discrete_sequence, hole=0.2)

fig.update_layout(title="Distribution of Organizational Forms", font_size=16, showlegend=False)
fig.update_traces(textposition='outside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=2)))
fig.show()
        organization  form_of_organization
0          Community                   160
1           Academia                   144
2  Government Agency                    99
3         For-Profit                    85
4         Non-Profit                    65
5      Collaboration                    58
# Number of organizations per ISO-3 country code, as the columns "country"
# and "counts". Rows whose country could not be resolved carry the empty code
# that name_to_iso3 returns on failure.
# reset_index(name=...) names the counts column directly, so the result does
# not depend on what value_counts() calls its output ("ISO_3" before
# pandas 2.0, "count" from 2.0 on).
df_countries = (
    df_organizations["ISO_3"]
    .value_counts()
    .rename_axis("country")
    .reset_index(name="counts")
)

# World map shaded by the number of organizations per country.
fig = px.choropleth(
    df_countries,
    locations="country",
    locationmode="ISO-3",
    color="counts",
    color_continuous_scale=color_continuous_scale
)

fig.update_layout(title="Distribution of Organizational Locations Worldwide",
                    coloraxis_colorbar=dict(
                    title="Organizations",
                    ),)

fig.show()
# The 40 organizations with the most public repositories (not used by the
# plot in this cell).
df_public_repos = df_organizations.nlargest(40, "organization_public_repos")
df_organizations["organizations_age_in_years"] = df_organizations["organization_created"].apply(calc_age)

# Bubble chart over the organizations younger than max_age_in_years: age on
# the x-axis, country on the y-axis, marker area from the number of public
# repositories and one legend entry (trace) per form of organization.
# form_of_organization is categorical, so no continuous colour scale or
# colour bar applies to this figure.
fig = px.scatter(
    df_organizations.query("organizations_age_in_years<@max_age_in_years"),
    x="organizations_age_in_years",
    y="location_country",
    size="organization_public_repos",
    color="form_of_organization",
    hover_name="organization_website",
    hover_data=["organization_name"],
    size_max=20,
    # Strip a "column=" prefix from the trace names when one is present.
    # Taking the last piece instead of index 1 also works when the name has
    # no "=" at all, which is what raised IndexError before.
).for_each_trace(lambda t: t.update(name=t.name.split("=")[-1]))


fig.update_layout(
    yaxis_title=None,
    xaxis_title="Organization Age in Years",
    height=1000,  # Added parameter
    title="Organizations forms within different countries",
    hoverlabel=dict(
        bgcolor="white"
    ),
)
fig.show()
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In [47], line 1
----> 1 fig = px.scatter(
      2     df_organizations.query("organizations_age_in_years<@max_age_in_years"),
      3     x="organizations_age_in_years",
      4     y="location_country",
      5     size="organization_public_repos",
      6     color="form_of_organization",
      7     hover_name="organization_website",
      8     hover_data=["organization_name"],
      9     size_max=20,
     10     color_continuous_scale=color_continuous_scale,
     11 ).for_each_trace(lambda t: t.update(name=t.name.split("=")[1]))
     14 fig.update_layout(
     15     coloraxis_colorbar=dict(
     16         title="DDS",
   (...)
     24 )
     25 )
     26 fig.show()

File /opt/hostedtoolcache/Python/3.8.14/x64/lib/python3.8/site-packages/plotly/graph_objs/_figure.py:810, in Figure.for_each_trace(self, fn, selector, row, col, secondary_y)
    764 def for_each_trace(
    765     self, fn, selector=None, row=None, col=None, secondary_y=None
    766 ) -> "Figure":
    767     """
    768 
    769     Apply a function to all traces that satisfy the specified selection
   (...)
    808 
    809     """
--> 810     return super(Figure, self).for_each_trace(fn, selector, row, col, secondary_y)

File /opt/hostedtoolcache/Python/3.8.14/x64/lib/python3.8/site-packages/plotly/basedatatypes.py:1306, in BaseFigure.for_each_trace(self, fn, selector, row, col, secondary_y)
   1262 """
   1263 Apply a function to all traces that satisfy the specified selection
   1264 criteria
   (...)
   1301     Returns the Figure object that the method was called on
   1302 """
   1303 for trace in self.select_traces(
   1304     selector=selector, row=row, col=col, secondary_y=secondary_y
   1305 ):
-> 1306     fn(trace)
   1308 return self

Cell In [47], line 11, in <lambda>(t)
      1 fig = px.scatter(
      2     df_organizations.query("organizations_age_in_years<@max_age_in_years"),
      3     x="organizations_age_in_years",
      4     y="location_country",
      5     size="organization_public_repos",
      6     color="form_of_organization",
      7     hover_name="organization_website",
      8     hover_data=["organization_name"],
      9     size_max=20,
     10     color_continuous_scale=color_continuous_scale,
---> 11 ).for_each_trace(lambda t: t.update(name=t.name.split("=")[1]))
     14 fig.update_layout(
     15     coloraxis_colorbar=dict(
     16         title="DDS",
   (...)
     24 )
     25 )
     26 fig.show()

IndexError: list index out of range